clear all 
set more off 
set maxvar 15000 
clear matrix


* Number of iterations
    local iter "200"

* Dataset to be filled in 
    set obs `iter'
    gen obs_id = _n
    
    expand 7
    bysort obs_id: gen decade = 1900+ (_n*10)
    
    foreach y in IGE rank {
        gen `y'_baseline=. 
        gen `y'_baseline_lb=.
        gen `y'_baseline_ub=.
        
        gen `y'_recall=.
    }
    
    tempfile data1 
    save `data1'

* Simulation

    forval y=1(1)`iter' {

        noisily display "Iteration:`y'"
        
	use "$Mydirectory1/3_Output/2_PooledData_analysis.dta", clear 
        keep if baseline_sample==1 
        
        count
        local total_obs = `r(N)'
        
        keep fatheroccej race south_merge dob wgt_sex_race log_son_baseline rank_son_baseline age10 decade
        
        /* No self-employed version in "retrospective" of PSID 
           so change 21 --> 28 */
        replace fatheroccej=28 if fatheroccej==21
        replace fatheroccej=68 if fatheroccej==65

    *---------------------------------------------------*
    /* MERGE IN RECALL MATRIX AND ASSIGN NEW OCCUPATIONS
       BASED ON DISTRIBUTION OF MISTAKES */
    *---------------------------------------------------*

        gen person_id = _n //keeps the observation number 
        
        joinby fatheroccej using "$PSID/output/PSID_recall_matrix_long.dta"
        
        * Sort by person and father share (in descending order)
        gsort + person_id - father_share
        
        by person_id: gen father_share_cumul = sum(father_share)
        
        * Re-assign occupation using uniform random variable
        local number_seed = 61122 + (`y'*100)
        set seed `number_seed'

        by person_id: gen firstobs= _n==1
        by person_id: gen lastobs= _n==_N
        
        generate u1 = runiform(0, 100) if firstobs==1
        by person_id: replace u1 = u1[_n-1] if u1==.
        
        gen fatheroccej_new = .
        
        replace fatheroccej_new = father_occ if u1<= father_share_cumul & firstobs==1 //for most common occupation
        replace fatheroccej_new = father_occ if firstobs!=1 & u1<=father_share_cumul & u1>father_share_cumul[_n-1] //for remaining occs in the middle of the pack
        replace fatheroccej_new = father_occ if lastobs==1 & u1>father_share_cumul
        
        keep if fatheroccej_new!=.
        
        * Make sure there's still the same # of observations
        count
        assert `r(N)'==`total_obs'
        
        gen match = fatheroccej == fatheroccej_new
        tab match   
        
    *---------------------------------*
    * ASSIGN NEW INCOME SCORES
    *---------------------------------*

    * Census income scores
        preserve 
        use "$CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta", clear
        keep fatheroccej race south_merge avg_HHinc_1940_byrace_bysouth avg_HHinc_*_byocc_byr_bys
        
        tempfile census
        save `census'
        
        restore
        
    * 1936 income scores
        preserve 
        use "$Survey1936/output/ConsumptionSurvey_1936_IncomeScores.dta", clear
        keep fatheroccej race south_merge avg_totfaminc_1936
        
        tempfile survey1936
        save `survey1936'
        
        restore
        
    * 1900 income scores
        preserve 
        use "$Mydirectory1/1_DataSources/1900_IncomeScores/output/IncomeScores_1900_byrace_bysouth.dta", clear
        rename occ1950ej_PH fatheroccej
        
        keep fatheroccej race south_merge netearn00_adj_byr_bys income_PH_farmfix_byr_bys
        
        tempfile inc1900
        save `inc1900'
        
        restore
        
        /* Reassign income scores for original and adjusted 
           occupations (because of changes in 21/28 and 61/65) */ 
        foreach m in og new {
      
        *Merge income scores
            merge m:1 fatheroccej race south_merge using `census', assert(2 3) keep(3) nogen
            merge m:1 fatheroccej race south_merge using `survey1936', assert(2 3) keep(3) nogen
            merge m:1 fatheroccej race south_merge using `inc1900', assert(2 3) keep(3) nogen
            
            * Create 1900 and 1940 income scores 
            gen avg_HHinc_1900_byocc_byr_bys = income_PH_farmfix_byr_bys
            replace avg_HHinc_1900_byocc_byr_bys = netearn00_adj_byr_bys if fatheroccej==81

            gen avg_HHinc_1940_byocc_byr_bys = avg_HHinc_1940_byrace_bysouth
            replace avg_HHinc_1940_byocc_byr_bys = avg_totfaminc_1936 if fatheroccej==81 | fatheroccej==21
            
            * Now log measures 
            foreach x in 1900 1940 1960 1970 1980 1990 {
                gen log_father_`x'_byr_bys = log(avg_HHinc_`x'_byocc_byr_bys)
            }
            
        * Create interpolated measures

            gen log_father_interpolated_`m'=.
            
            /* 1910-1930 cohorts---give them a weighted average
               of 1900 and 1940 income scores */
            forval i=1920(1)1940 {
                local j = `i'-1900 //# years away from 1900
                local k = 40-`j' //# years away from 1940
                
                quietly replace log_father_interpolated_`m'= ((`k'/40)*log_father_1900_byr_bys) + ((`j'/40)*log_father_1940_byr_bys) if age10==`i'
            }
     
            /* 1931-1950 cohorts---give them a weighted average
               of 1940 and 1960 income scores */
            forval i=1941(1)1960 {
                local j=`i'-1940 //# years away from first decade
                local k=20-`j' //# years away from second decade
                
                quietly replace log_father_interpolated_`m' = ((`k'/20)*log_father_1940_byr_bys) + ((`j'/20)*log_father_1960_byr_bys) if age10==`i'
            }   

            /* 1951-1979 birth cohorts: Give them a weighted average
                                        of two Censuses closest to when 
                                        the survey respondent turned 10. */
            foreach decade1 in 6 7 8 {
                local decade2 = `decade1'+1
                
                forval i=19`decade1'1(1)19`decade2'0 {
                    local j=`i'-19`decade1'0 //# years away from first decade
                    local k=10-`j' //# years away from second decade
                    
                    quietly replace log_father_interpolated_`m' = ((`k'/10)*log_father_19`decade1'0_byr_bys) + ((`j'/10)*log_father_19`decade2'0_byr_bys) if age10==`i'
                }
            }
            
            drop avg_HHinc_1940_byrace_bysouth- log_father_1990_byr_bys

            if "`m'"=="og" {
                rename fatheroccej fatheroccej_og
                rename fatheroccej_new fatheroccej 
            }
                 
        }
            
    * Create ranks
        foreach m in og new {
            egen rank_father_interpolated_`m' = xtile(log_father_interpolated_`m'), by(dob) nq(100) weight(wgt_sex_race)
        } 
            
            
    * Regressions
        forval i=1(1)7 {
        
            * IGE
            quietly reg log_son_baseline log_father_interpolated_og  if decade==19`i'0 [aw=wgt_sex_race], robust
            
            local ige1_`i' = _b[log_father_interpolated_og] 
            local ige1_`i'_se = _se[log_father_interpolated_og] 
            
            quietly reg log_son_baseline log_father_interpolated_new  if decade==19`i'0 [aw=wgt_sex_race], robust
            
            local ige2_`i' = _b[log_father_interpolated_new]
            
            * Ranks
            quietly reg rank_son_baseline rank_father_interpolated_og if decade==19`i'0 [aw=wgt_sex_race], robust
            
            local rank1_`i' = _b[rank_father_interpolated_og]
            local rank1_`i'_se = _se[rank_father_interpolated_og]
            
            quietly reg rank_son_baseline rank_father_interpolated_new if decade==19`i'0 [aw=wgt_sex_race], robust
            
            local rank2_`i' = _b[rank_father_interpolated_new]
            
        }
        
        * Save in master dataset 
            use `data1', clear
            
            forval i=1(1)7 {
                replace IGE_baseline = `ige1_`i'' if obs_id==`y' & decade==19`i'0
                replace IGE_baseline_lb = `ige1_`i''-(1.96*`ige1_`i'_se') if obs_id==`y' & decade==19`i'0
                replace IGE_baseline_ub = `ige1_`i''+(1.96*`ige1_`i'_se') if obs_id==`y' & decade==19`i'0

                replace rank_baseline = `rank1_`i'' if obs_id==`y' & decade==19`i'0
                replace rank_baseline_lb = `rank1_`i''-(1.96*`rank1_`i'_se') if obs_id==`y' & decade==19`i'0
                replace rank_baseline_ub = `rank1_`i''+(1.96*`rank1_`i'_se') if obs_id==`y' & decade==19`i'0
                
                replace IGE_recall = `ige2_`i'' if obs_id==`y' & decade==19`i'0
                replace rank_recall = `rank2_`i'' if obs_id==`y' & decade==19`i'0
        }
            
        tempfile data1
        save `data1'
        
    }

* Grab mean from simulation for each cohort
    gen IGE_recall_mean = .
    gen rank_recall_mean =.
    
    forval i=1(1)7 {
        sum IGE_recall if decade==19`i'0, 
        replace IGE_recall_mean = `r(mean)' if decade==19`i'0 
        
        sum rank_recall if decade==19`i'0 
        replace rank_recall_mean = `r(mean)' if decade==19`i'0  
    }
    
* Grab 2.5 and 97.5 percentiles for coverage 
    gen IGE_recall_ub = .
    gen rank_recall_ub =.
    gen IGE_recall_lb = .
    gen rank_recall_lb =.
    
    foreach y in IGE rank {
        forval i=1(1)7 {    
            xtile `y'_pct = `y'_recall if decade==19`i'0, nq(100)
            
            sum `y'_recall if `y'_pct>=2 & `y'_pct<=3 & decade==19`i'0
            replace `y'_recall_lb = `r(mean)' if decade==19`i'0
            
            sum `y'_recall if `y'_pct>=97 & `y'_pct<=98 & decade==19`i'0
            replace `y'_recall_ub = `r(mean)' if decade==19`i'0
            
            drop `y'_pct 
        }
    }

    
    bysort decade: keep if _n==1
    gen decade2 = decade+2
    
    #delimit ;
    twoway (scatter IGE_baseline decade, lpat(solid) lcolor(midblue) msymbol(circle) mcolor(midblue) msize(medium)) 
           (rcap IGE_baseline_ub IGE_baseline_lb decade, lpatter(solid) lcolor(midblue) yaxis(1) lwidth(0.5))  
           (scatter IGE_recall_mean decade2, lpat(dash) lcolor(navy) mcolor(navy) msymbol(circle) mcolor(navy) msize(medium))
           (rcap IGE_recall_ub IGE_recall_lb decade2, lpatter(solid) lcolor(navy) yaxis(1) lwidth(0.5)), 
           legend(on ring(0) pos(8) row(2) order(1 "Baseline" 3 "Recall adjusted")) 
           yscale(range(0 1)) ylabel(0(0.25)1) xlabel(1910(10)1970) xscale(range(1905 1975))
           xti(" " "Decade of respondent's birth") yti("IGE coefficient" " ", axis(1))
           xlabel(1910 "1910s" 1920 "1920s" 1930 "1930s" 1940 "1940s" 1950 "1950s" 1960 "1960s" 1970 "1970s", labsize(small) );
    #delimit cr
    graph export "$Mydirectory2/appendix_b/Recall_sim_ige.pdf", as(pdf) replace

    #delimit ;
    twoway (scatter rank_baseline decade, lpat(solid) lcolor(orange) msymbol(diamond) mcolor(orange) msize(medium)) 
           (rcap rank_baseline_ub rank_baseline_lb decade, lpatter(solid) lcolor(orange) yaxis(1) lwidth(0.5))
           (scatter rank_recall_mean decade2, lpat(dash) lcolor(erose) mcolor(erose) msymbol(diamond) mcolor(erose) msize(medium))
           (rcap rank_recall_ub rank_recall_lb decade2, lpatter(solid) lcolor(erose) yaxis(1) lwidth(0.5)), 
           legend(on ring(0) pos(8) row(2) order(1 "Baseline" 3 "Recall adjusted")) 
           yscale(range(0 0.45)) ylabel(0(0.15)0.45) xlabel(1910(10)1970) xscale(range(1905 1975))
           xti(" " "Decade of respondent's birth") yti("Rank coefficient" " ", axis(1))
           xlabel(1910 "1910s" 1920 "1920s" 1930 "1930s" 1940 "1940s" 1950 "1950s" 1960 "1960s" 1970 "1970s", labsize(small) );
    #delimit cr
    graph export "$Mydirectory2/appendix_b/Recall_sim_rank.pdf", as(pdf) replace

